Dataset Description

Spotify Tracks Dataset is about playlist features and how they compare by genre or album/artist ids w.r.t popularity, acoustic ness, danceability, energy, instrumentals, key (musical chords) etc. Since all of us commonly listen to music, we found it interesting to explore various factors and their influence on playlists that become popular.

About the features

It seems that Spotify has defined features based on various audio acoustic features such as Shimmer, pitch, tone, fundamental frequency, etc as well as musical acoustics. https://developer.spotify.com/documentation/web-api/reference/#endpoint-get-audio-features The audio features seem to be based on pitch ranges, harmonics, overtones of musical instruments along with known vocal quality metrics based on features such as jitter, shimmer, pitch, tone etc. Each musical acoustics, as well as voice acoustics, are analyzed based on defined audio signal processing standards.

Goal

Get data from url to a Tibble.

#rcurl <- getURL("https://www.kaggle.com/zaheenhamidani/ultimate-spotify-tracks-db/download")
data_url <- "https://raw.githubusercontent.com/sushmaakoju/spotify-tracks-data-analysis/main/SpotifyFeatures.csv"
column_names <- c("genre","artist_name","track_name","track_id","popularity","acousticness","danceability","duration_ms","energy","instrumentalness","key","liveness","loudness","mode","speechiness","tempo","time_signature","valence")
spotifydf <- read_csv(data_url, show_col_types=FALSE)
head(spotifydf)
## # A tibble: 6 x 18
##   genre artist_name  track_name   track_id  popularity acousticness danceability
##   <chr> <chr>        <chr>        <chr>          <dbl>        <dbl>        <dbl>
## 1 Movie Henri Salva~ C'est beau ~ 0BRjO6ga~          0        0.611        0.389
## 2 Movie Martin & le~ Perdu d'ava~ 0BjC1Nfo~          1        0.246        0.59 
## 3 Movie Joseph Will~ Don't Let M~ 0CoSDzoN~          3        0.952        0.663
## 4 Movie Henri Salva~ Dis-moi Mon~ 0Gc6TVm5~          0        0.703        0.24 
## 5 Movie Fabien Nataf Ouverture    0IuslXpM~          4        0.95         0.331
## 6 Movie Henri Salva~ Le petit so~ 0Mf1jKa8~          0        0.749        0.578
## # ... with 11 more variables: duration_ms <dbl>, energy <dbl>,
## #   instrumentalness <dbl>, key <chr>, liveness <dbl>, loudness <dbl>,
## #   mode <chr>, speechiness <dbl>, tempo <dbl>, time_signature <chr>,
## #   valence <dbl>
colSums(is.na(spotifydf))
##            genre      artist_name       track_name         track_id 
##                0                0                0                0 
##       popularity     acousticness     danceability      duration_ms 
##                0                0                0                0 
##           energy instrumentalness              key         liveness 
##                0                0                0                0 
##         loudness             mode      speechiness            tempo 
##                0                0                0                0 
##   time_signature          valence 
##                0                0
head( spotifydf)
## # A tibble: 6 x 18
##   genre artist_name  track_name   track_id  popularity acousticness danceability
##   <chr> <chr>        <chr>        <chr>          <dbl>        <dbl>        <dbl>
## 1 Movie Henri Salva~ C'est beau ~ 0BRjO6ga~          0        0.611        0.389
## 2 Movie Martin & le~ Perdu d'ava~ 0BjC1Nfo~          1        0.246        0.59 
## 3 Movie Joseph Will~ Don't Let M~ 0CoSDzoN~          3        0.952        0.663
## 4 Movie Henri Salva~ Dis-moi Mon~ 0Gc6TVm5~          0        0.703        0.24 
## 5 Movie Fabien Nataf Ouverture    0IuslXpM~          4        0.95         0.331
## 6 Movie Henri Salva~ Le petit so~ 0Mf1jKa8~          0        0.749        0.578
## # ... with 11 more variables: duration_ms <dbl>, energy <dbl>,
## #   instrumentalness <dbl>, key <chr>, liveness <dbl>, loudness <dbl>,
## #   mode <chr>, speechiness <dbl>, tempo <dbl>, time_signature <chr>,
## #   valence <dbl>
glimpse(spotifydf)
## Rows: 232,725
## Columns: 18
## $ genre            <chr> "Movie", "Movie", "Movie", "Movie", "Movie", "Movie",~
## $ artist_name      <chr> "Henri Salvador", "Martin & les fées", "Joseph Willia~
## $ track_name       <chr> "C'est beau de faire un Show", "Perdu d'avance (par G~
## $ track_id         <chr> "0BRjO6ga9RKCKjfDqeFgWV", "0BjC1NfoEOOusryehmNudP", "~
## $ popularity       <dbl> 0, 1, 3, 0, 4, 0, 2, 15, 0, 10, 0, 2, 4, 3, 0, 0, 0, ~
## $ acousticness     <dbl> 0.61100, 0.24600, 0.95200, 0.70300, 0.95000, 0.74900,~
## $ danceability     <dbl> 0.389, 0.590, 0.663, 0.240, 0.331, 0.578, 0.703, 0.41~
## $ duration_ms      <dbl> 99373, 137373, 170267, 152427, 82625, 160627, 212293,~
## $ energy           <dbl> 0.9100, 0.7370, 0.1310, 0.3260, 0.2250, 0.0948, 0.270~
## $ instrumentalness <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.123~
## $ key              <chr> "C#", "F#", "C", "C#", "F", "C#", "C#", "F#", "C", "G~
## $ liveness         <dbl> 0.3460, 0.1510, 0.1030, 0.0985, 0.2020, 0.1070, 0.105~
## $ loudness         <dbl> -1.828, -5.559, -13.879, -12.178, -21.150, -14.970, -~
## $ mode             <chr> "Major", "Minor", "Minor", "Major", "Major", "Major",~
## $ speechiness      <dbl> 0.0525, 0.0868, 0.0362, 0.0395, 0.0456, 0.1430, 0.953~
## $ tempo            <dbl> 166.969, 174.003, 99.488, 171.758, 140.576, 87.479, 8~
## $ time_signature   <chr> "4/4", "4/4", "5/4", "4/4", "4/4", "4/4", "4/4", "4/4~
## $ valence          <dbl> 0.8140, 0.8160, 0.3680, 0.2270, 0.3900, 0.3580, 0.533~

Including Plots

Some genres are duplicated. (encoding mismatches).

na.omit(spotifydf)
## # A tibble: 232,725 x 18
##    genre artist_name  track_name   track_id popularity acousticness danceability
##    <chr> <chr>        <chr>        <chr>         <dbl>        <dbl>        <dbl>
##  1 Movie Henri Salva~ C'est beau ~ 0BRjO6g~          0      0.611          0.389
##  2 Movie Martin & le~ Perdu d'ava~ 0BjC1Nf~          1      0.246          0.59 
##  3 Movie Joseph Will~ Don't Let M~ 0CoSDzo~          3      0.952          0.663
##  4 Movie Henri Salva~ Dis-moi Mon~ 0Gc6TVm~          0      0.703          0.24 
##  5 Movie Fabien Nataf Ouverture    0IuslXp~          4      0.95           0.331
##  6 Movie Henri Salva~ Le petit so~ 0Mf1jKa~          0      0.749          0.578
##  7 Movie Martin & le~ Premières r~ 0NUiKYR~          2      0.344          0.703
##  8 Movie Laura Mayne  Let Me Let ~ 0PbIF9Y~         15      0.939          0.416
##  9 Movie Chorus       Helka        0ST6uPf~          0      0.00104        0.734
## 10 Movie Le Club des~ Les bisous ~ 0VSqZ3K~         10      0.319          0.598
## # ... with 232,715 more rows, and 11 more variables: duration_ms <dbl>,
## #   energy <dbl>, instrumentalness <dbl>, key <chr>, liveness <dbl>,
## #   loudness <dbl>, mode <chr>, speechiness <dbl>, tempo <dbl>,
## #   time_signature <chr>, valence <dbl>
genres <- distinct(spotifydf, genre)$genre
genres
##  [1] "Movie"            "R&B"              "A Capella"        "Alternative"     
##  [5] "Country"          "Dance"            "Electronic"       "Anime"           
##  [9] "Folk"             "Blues"            "Opera"            "Hip-Hop"         
## [13] "Children's Music" "Children’s Music" "Rap"              "Indie"           
## [17] "Classical"        "Pop"              "Reggae"           "Reggaeton"       
## [21] "Jazz"             "Rock"             "Ska"              "Comedy"          
## [25] "Soul"             "Soundtrack"       "World"
spotifydf$genre[spotifydf$genre=="Children's Music"] <- "Children’s Music"
spotifydf[!duplicated(spotifydf$track_id),]
## # A tibble: 176,774 x 18
##    genre artist_name  track_name   track_id popularity acousticness danceability
##    <chr> <chr>        <chr>        <chr>         <dbl>        <dbl>        <dbl>
##  1 Movie Henri Salva~ C'est beau ~ 0BRjO6g~          0      0.611          0.389
##  2 Movie Martin & le~ Perdu d'ava~ 0BjC1Nf~          1      0.246          0.59 
##  3 Movie Joseph Will~ Don't Let M~ 0CoSDzo~          3      0.952          0.663
##  4 Movie Henri Salva~ Dis-moi Mon~ 0Gc6TVm~          0      0.703          0.24 
##  5 Movie Fabien Nataf Ouverture    0IuslXp~          4      0.95           0.331
##  6 Movie Henri Salva~ Le petit so~ 0Mf1jKa~          0      0.749          0.578
##  7 Movie Martin & le~ Premières r~ 0NUiKYR~          2      0.344          0.703
##  8 Movie Laura Mayne  Let Me Let ~ 0PbIF9Y~         15      0.939          0.416
##  9 Movie Chorus       Helka        0ST6uPf~          0      0.00104        0.734
## 10 Movie Le Club des~ Les bisous ~ 0VSqZ3K~         10      0.319          0.598
## # ... with 176,764 more rows, and 11 more variables: duration_ms <dbl>,
## #   energy <dbl>, instrumentalness <dbl>, key <chr>, liveness <dbl>,
## #   loudness <dbl>, mode <chr>, speechiness <dbl>, tempo <dbl>,
## #   time_signature <chr>, valence <dbl>
#spotifydf[(spotifydf$genre=="Children's Music"),]
genres <- distinct(spotifydf, genre)$genre

Convert character format columns: key and mode to numeric values

unique(spotifydf$key)
##  [1] "C#" "F#" "C"  "F"  "G"  "E"  "D#" "G#" "D"  "A#" "A"  "B"
unique(as.numeric(as.factor(spotifydf$key)))
##  [1]  5 10  4  9 11  8  7 12  6  2  1  3
spotifydf$key <- as.numeric(as.factor(spotifydf$key))

unique(spotifydf$mode)
## [1] "Major" "Minor"
unique(as.numeric(as.factor(spotifydf$mode)))
## [1] 1 2
spotifydf$mode <- as.numeric(as.factor(spotifydf$mode))

Get all numeric columns from the dataframe.

str(spotifydf)
## spec_tbl_df [232,725 x 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ genre           : chr [1:232725] "Movie" "Movie" "Movie" "Movie" ...
##  $ artist_name     : chr [1:232725] "Henri Salvador" "Martin & les fées" "Joseph Williams" "Henri Salvador" ...
##  $ track_name      : chr [1:232725] "C'est beau de faire un Show" "Perdu d'avance (par Gad Elmaleh)" "Don't Let Me Be Lonely Tonight" "Dis-moi Monsieur Gordon Cooper" ...
##  $ track_id        : chr [1:232725] "0BRjO6ga9RKCKjfDqeFgWV" "0BjC1NfoEOOusryehmNudP" "0CoSDzoNIKCRs124s9uTVy" "0Gc6TVm52BwZD07Ki6tIvf" ...
##  $ popularity      : num [1:232725] 0 1 3 0 4 0 2 15 0 10 ...
##  $ acousticness    : num [1:232725] 0.611 0.246 0.952 0.703 0.95 0.749 0.344 0.939 0.00104 0.319 ...
##  $ danceability    : num [1:232725] 0.389 0.59 0.663 0.24 0.331 0.578 0.703 0.416 0.734 0.598 ...
##  $ duration_ms     : num [1:232725] 99373 137373 170267 152427 82625 ...
##  $ energy          : num [1:232725] 0.91 0.737 0.131 0.326 0.225 0.0948 0.27 0.269 0.481 0.705 ...
##  $ instrumentalness: num [1:232725] 0 0 0 0 0.123 0 0 0 0.00086 0.00125 ...
##  $ key             : num [1:232725] 5 10 4 5 9 5 5 10 4 11 ...
##  $ liveness        : num [1:232725] 0.346 0.151 0.103 0.0985 0.202 0.107 0.105 0.113 0.0765 0.349 ...
##  $ loudness        : num [1:232725] -1.83 -5.56 -13.88 -12.18 -21.15 ...
##  $ mode            : num [1:232725] 1 2 2 1 1 1 1 1 1 1 ...
##  $ speechiness     : num [1:232725] 0.0525 0.0868 0.0362 0.0395 0.0456 0.143 0.953 0.0286 0.046 0.0281 ...
##  $ tempo           : num [1:232725] 167 174 99.5 171.8 140.6 ...
##  $ time_signature  : chr [1:232725] "4/4" "4/4" "5/4" "4/4" ...
##  $ valence         : num [1:232725] 0.814 0.816 0.368 0.227 0.39 0.358 0.533 0.274 0.765 0.718 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   genre = col_character(),
##   ..   artist_name = col_character(),
##   ..   track_name = col_character(),
##   ..   track_id = col_character(),
##   ..   popularity = col_double(),
##   ..   acousticness = col_double(),
##   ..   danceability = col_double(),
##   ..   duration_ms = col_double(),
##   ..   energy = col_double(),
##   ..   instrumentalness = col_double(),
##   ..   key = col_character(),
##   ..   liveness = col_double(),
##   ..   loudness = col_double(),
##   ..   mode = col_character(),
##   ..   speechiness = col_double(),
##   ..   tempo = col_double(),
##   ..   time_signature = col_character(),
##   ..   valence = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
columns <- colnames(spotifydf)
columns
##  [1] "genre"            "artist_name"      "track_name"       "track_id"        
##  [5] "popularity"       "acousticness"     "danceability"     "duration_ms"     
##  [9] "energy"           "instrumentalness" "key"              "liveness"        
## [13] "loudness"         "mode"             "speechiness"      "tempo"           
## [17] "time_signature"   "valence"
numeric_columns <- unlist(lapply(spotifydf, is.numeric))
numeric_columns
##            genre      artist_name       track_name         track_id 
##            FALSE            FALSE            FALSE            FALSE 
##       popularity     acousticness     danceability      duration_ms 
##             TRUE             TRUE             TRUE             TRUE 
##           energy instrumentalness              key         liveness 
##             TRUE             TRUE             TRUE             TRUE 
##         loudness             mode      speechiness            tempo 
##             TRUE             TRUE             TRUE             TRUE 
##   time_signature          valence 
##            FALSE             TRUE
numeric_spotifydf <- spotifydf[,numeric_columns]
colnames(numeric_spotifydf)
##  [1] "popularity"       "acousticness"     "danceability"     "duration_ms"     
##  [5] "energy"           "instrumentalness" "key"              "liveness"        
##  [9] "loudness"         "mode"             "speechiness"      "tempo"           
## [13] "valence"

Use Corrr library to plot correlation based on feature groups.

corr_df <- correlate(numeric_spotifydf, quiet = TRUE)
corr_df
## # A tibble: 13 x 14
##    term             popularity acousticness danceability duration_ms  energy
##    <chr>                 <dbl>        <dbl>        <dbl>       <dbl>   <dbl>
##  1 popularity        NA             -0.381       0.257       0.00235  0.249 
##  2 acousticness      -0.381         NA          -0.365       0.0112  -0.726 
##  3 danceability       0.257         -0.365      NA          -0.126    0.326 
##  4 duration_ms        0.00235        0.0112     -0.126      NA       -0.0305
##  5 energy             0.249         -0.726       0.326      -0.0305  NA     
##  6 instrumentalness  -0.211          0.316      -0.365       0.0760  -0.379 
##  7 key               -0.000943       0.0143     -0.00700    -0.00351 -0.0104
##  8 liveness          -0.168          0.0690     -0.0417      0.0238   0.193 
##  9 loudness           0.363         -0.690       0.439      -0.0476   0.816 
## 10 mode               0.0706        -0.0559      0.0619      0.0116   0.0413
## 11 speechiness       -0.151          0.151       0.135      -0.0162   0.145 
## 12 tempo              0.0810        -0.238       0.0219     -0.0285   0.229 
## 13 valence            0.0601        -0.326       0.547      -0.142    0.437 
## # ... with 8 more variables: instrumentalness <dbl>, key <dbl>, liveness <dbl>,
## #   loudness <dbl>, mode <dbl>, speechiness <dbl>, tempo <dbl>, valence <dbl>
corr_df %>% 
  select(-term) %>% 
  map_dbl(~ mean(., na.rm = TRUE))
##       popularity     acousticness     danceability      duration_ms 
##      0.014184858     -0.184994439      0.073552896     -0.022415605 
##           energy instrumentalness              key         liveness 
##      0.107507980     -0.145097040     -0.006138012      0.036819088 
##         loudness             mode      speechiness            tempo 
##      0.088582298      0.012430743      0.046871552      0.014349281 
##          valence 
##      0.069672654
corr_df %>%
  focus(popularity:valence, mirror = TRUE) %>%
  rearrange(absolute = FALSE) %>% 
  shave() %>%
  rplot(colors = c("lightblue", "green"), legend = TRUE) 
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

corr_df2 <- cor(numeric_spotifydf)
col3 = hcl.colors(10, "YlOrRd", rev = TRUE)
corr1 <- corrplot(corr_df2, col=col3, method = 'number') 

corrplot(corr_df2,  col= c("lightblue", "blue"),method = 'color', order = 'alphabet') 

corrplot(corr_df2,  order = 'hclust', addrect = 2)

corrplot(corr_df2/2, col.lim=c(-0.5, 0.5))

### There is a high correlation between energy and loudness features and similarly there is a second highest correlation between valence and danceability where valence is positiveness or negativeness of a track defined by (cheerful vs sad, depressive tune, lyrics) #### For each feature, plot grouping w.r.t Popularity.

genres_df <- spotifydf %>%
   select(popularity, genre)%>%
    count(popularity, genre)
by_genres <- spotifydf %>% group_by(genre, popularity)
by_genres %>% summarise(
  popularity = mean(popularity),
)
## `summarise()` has grouped output by 'genre'. You can override using the `.groups` argument.
## # A tibble: 1,802 x 2
## # Groups:   genre [26]
##    genre     popularity
##    <chr>          <dbl>
##  1 A Capella          0
##  2 A Capella          1
##  3 A Capella          2
##  4 A Capella          3
##  5 A Capella          4
##  6 A Capella          5
##  7 A Capella          6
##  8 A Capella          7
##  9 A Capella          8
## 10 A Capella          9
## # ... with 1,792 more rows
by_genre <- by_genres %>% summarise(n = n())
## `summarise()` has grouped output by 'genre'. You can override using the `.groups` argument.
by_genre %>% summarise(n = sum(n)) %>% filter(n>0)
## # A tibble: 26 x 2
##    genre                n
##    <chr>            <int>
##  1 A Capella          119
##  2 Alternative       9263
##  3 Anime             8936
##  4 Blues             9023
##  5 Children’s Music 14756
##  6 Classical         9256
##  7 Comedy            9681
##  8 Country           8664
##  9 Dance             8701
## 10 Electronic        9377
## # ... with 16 more rows
#vtree(genres_df, "genre")
colnames(by_genre)
## [1] "genre"      "popularity" "n"
ggplot(genres_df)+
  geom_point(aes(x = genre, y= n))

genres_df %>% ggplot(aes(x = genre, y= n))+
  geom_line(aes(color = "genre")) 

ggplot(data=by_genre,aes(x = genre, y = n, fill=genre)) +
  geom_bar(stat="identity", width=0.5,  position=position_dodge())+
    #geom_text(aes(label=genre), vjust=1.6, color="white", size=3.5)
coord_flip()+
    #scale_fill_brewer(palette="Pastel1") + theme_minimal()+
    labs(title = "Spotify tracks by Genre in US", y= NULL)

    # geom_errorbar(aes(ymin=n-sd, ymax=n+sd), width=.2,
    #              position=position_dodge(.9))

    #geom_line(aes(color = "genre")) +
  #geom_point(aes(color = "genre")) +
  #geom_line(aes(y = n, color = "n")) +
  # geom_point(aes(y = n, color = "n")) +
  #   scale_y_log10() +
  #   theme(legend.position="bottom",
  #         axis.text.x = element_text(angle = 90)) +
set.seed(542863)
# cfspotify <- cforest(numeric_columns ~ ., data = numeric_spotifydf,
#                     control = cforest_unbiased(mtry = 2, ntree = 50,
#                                               minbucket = 5, 
#                                               minsplit = 10))

# spotifydf_permimp <- permimp(cfspotify, conditional = TRUE, progressBar = TRUE)
#plot(spotifydf_permimp, type = "bar")

The above Permutation Importance computations show poor results since values are not normalized and/or adjusting values influences popularity scores very heavily meaning deviation is significant.

X <- numeric_spotifydf %>% 
  select(-c(popularity))
y <- numeric_spotifydf %>%
  select(c(popularity))
summary( X)
##   acousticness     danceability     duration_ms          energy         
##  Min.   :0.0000   Min.   :0.0569   Min.   :  15387   Min.   :0.0000203  
##  1st Qu.:0.0376   1st Qu.:0.4350   1st Qu.: 182857   1st Qu.:0.3850000  
##  Median :0.2320   Median :0.5710   Median : 220427   Median :0.6050000  
##  Mean   :0.3686   Mean   :0.5544   Mean   : 235122   Mean   :0.5709577  
##  3rd Qu.:0.7220   3rd Qu.:0.6920   3rd Qu.: 265768   3rd Qu.:0.7870000  
##  Max.   :0.9960   Max.   :0.9890   Max.   :5552917   Max.   :0.9990000  
##  instrumentalness         key            liveness          loudness      
##  Min.   :0.0000000   Min.   : 1.000   Min.   :0.00967   Min.   :-52.457  
##  1st Qu.:0.0000000   1st Qu.: 4.000   1st Qu.:0.09740   1st Qu.:-11.771  
##  Median :0.0000443   Median : 6.000   Median :0.12800   Median : -7.762  
##  Mean   :0.1483012   Mean   : 6.344   Mean   :0.21501   Mean   : -9.570  
##  3rd Qu.:0.0358000   3rd Qu.: 9.000   3rd Qu.:0.26400   3rd Qu.: -5.501  
##  Max.   :0.9990000   Max.   :12.000   Max.   :1.00000   Max.   :  3.744  
##       mode        speechiness         tempo           valence      
##  Min.   :1.000   Min.   :0.0222   Min.   : 30.38   Min.   :0.0000  
##  1st Qu.:1.000   1st Qu.:0.0367   1st Qu.: 92.96   1st Qu.:0.2370  
##  Median :1.000   Median :0.0501   Median :115.78   Median :0.4440  
##  Mean   :1.348   Mean   :0.1208   Mean   :117.67   Mean   :0.4549  
##  3rd Qu.:2.000   3rd Qu.:0.1050   3rd Qu.:139.05   3rd Qu.:0.6600  
##  Max.   :2.000   Max.   :0.9670   Max.   :242.90   Max.   :1.0000
summary(y)
##    popularity    
##  Min.   :  0.00  
##  1st Qu.: 29.00  
##  Median : 43.00  
##  Mean   : 41.13  
##  3rd Qu.: 55.00  
##  Max.   :100.00
#xnew <- c(0.45,1.84,2.3)
#points(xnew,df(xnew),col=2)
genres_df <- spotifydf %>%
  select(-c(artist_name, track_name, track_id, time_signature, key))

Check linear fit between all features vs popularity

This is multiple regression since we have multiple predictors vs one response variable.

lmfit = lm(popularity ~ acousticness + danceability + duration_ms +energy + instrumentalness + liveness + loudness + speechiness + tempo + valence, numeric_spotifydf )
summary(lmfit)
## 
## Call:
## lm(formula = popularity ~ acousticness + danceability + duration_ms + 
##     energy + instrumentalness + liveness + loudness + speechiness + 
##     tempo + valence, data = numeric_spotifydf)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -58.034 -10.215   1.413  11.058  57.368 
## 
## Coefficients:
##                        Estimate     Std. Error t value             Pr(>|t|)    
## (Intercept)       55.3882318323   0.3440857483 160.972 < 0.0000000000000002 ***
## acousticness     -11.9348863471   0.1547306909 -77.133 < 0.0000000000000002 ***
## danceability      17.7399712075   0.2421276181  73.267 < 0.0000000000000002 ***
## duration_ms        0.0000024569   0.0000002816   8.726 < 0.0000000000000002 ***
## energy            -5.6278359232   0.2794090244 -20.142 < 0.0000000000000002 ***
## instrumentalness  -4.3105903938   0.1330431074 -32.400 < 0.0000000000000002 ***
## liveness          -9.6705519516   0.2011430478 -48.078 < 0.0000000000000002 ***
## loudness           0.7150992187   0.0112616140  63.499 < 0.0000000000000002 ***
## speechiness       -8.1081609287   0.2298549766 -35.275 < 0.0000000000000002 ***
## tempo             -0.0042749795   0.0011180789  -3.824             0.000132 ***
## valence          -13.2258861142   0.1659321212 -79.707 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.92 on 232714 degrees of freedom
## Multiple R-squared:  0.2339, Adjusted R-squared:  0.2339 
## F-statistic:  7106 on 10 and 232714 DF,  p-value: < 0.00000000000000022
plot(lmfit)

# checking coefficients

coefficients(lmfit)
##      (Intercept)     acousticness     danceability      duration_ms 
##  55.388231832270 -11.934886347087  17.739971207543   0.000002456887 
##           energy instrumentalness         liveness         loudness 
##  -5.627835923246  -4.310590393787  -9.670551951580   0.715099218724 
##      speechiness            tempo          valence 
##  -8.108160928731  -0.004274979477 -13.225886114236
lmfit2 = lm(popularity ~ acousticness + danceability +energy + instrumentalness + liveness + loudness + speechiness + tempo + valence + key+mode+(energy * loudness) + (valence * danceability) , numeric_spotifydf )
summary(lmfit2)
## 
## Call:
## lm(formula = popularity ~ acousticness + danceability + energy + 
##     instrumentalness + liveness + loudness + speechiness + tempo + 
##     valence + key + mode + (energy * loudness) + (valence * danceability), 
##     data = numeric_spotifydf)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -60.60 -10.16   1.42  10.98  57.75 
## 
## Coefficients:
##                        Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)           48.642916   0.383028 126.996 < 0.0000000000000002 ***
## acousticness         -11.869822   0.154208 -76.973 < 0.0000000000000002 ***
## danceability          26.491613   0.379808  69.750 < 0.0000000000000002 ***
## energy                -4.806738   0.303869 -15.818 < 0.0000000000000002 ***
## instrumentalness      -4.074058   0.133798 -30.449 < 0.0000000000000002 ***
## liveness              -9.667267   0.201558 -47.963 < 0.0000000000000002 ***
## loudness               0.627526   0.012114  51.802 < 0.0000000000000002 ***
## speechiness           -8.102821   0.232924 -34.787 < 0.0000000000000002 ***
## tempo                 -0.008507   0.001124  -7.569   0.0000000000000376 ***
## valence                0.836879   0.480117   1.743               0.0813 .  
## key                    0.038810   0.009514   4.079   0.0000452207135986 ***
## mode                   1.895830   0.069785  27.167 < 0.0000000000000002 ***
## energy:loudness        0.269525   0.024068  11.198 < 0.0000000000000002 ***
## danceability:valence -23.230354   0.749642 -30.989 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 15.87 on 232711 degrees of freedom
## Multiple R-squared:  0.2393, Adjusted R-squared:  0.2392 
## F-statistic:  5630 on 13 and 232711 DF,  p-value: < 0.00000000000000022
plot(lmfit2)

spotify_histograms <- numeric_spotifydf[,-c(4)]
plot_num(spotify_histograms)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.

ggplot(spotifydf, aes(popularity)) +
  geom_density()

ggplot(spotifydf, aes(energy)) +
  geom_density()

ggplot(spotifydf, aes(danceability)) +
  geom_density()

ggplot(spotifydf, aes(key)) +
  geom_density()

spotify_summary <- datasummary_skim(numeric_spotifydf)
spotify_summary
Unique (#) Missing (%) Mean SD Min Median Max
popularity 101 0 41.1 18.2 0.0 43.0 100.0
acousticness 4734 0 0.4 0.4 0.0 0.2 1.0
danceability 1295 0 0.6 0.2 0.1 0.6 1.0
duration_ms 70749 0 235122.3 118935.9 15387.0 220427.0 5552917.0
energy 2517 0 0.6 0.3 0.0 0.6 1.0
instrumentalness 5400 0 0.1 0.3 0.0 0.0 1.0
key 12 0 6.3 3.5 1.0 6.0 12.0
liveness 1732 0 0.2 0.2 0.0 0.1 1.0
loudness 27923 0 -9.6 6.0 -52.5 -7.8 3.7
mode 2 0 1.3 0.5 1.0 1.0 2.0
speechiness 1641 0 0.1 0.2 0.0 0.1 1.0
tempo 78512 0 117.7 30.9 30.4 115.8 242.9
valence 1692 0 0.5 0.3 0.0 0.4 1.0
models <- list(
  "OLS"     = lmfit2,
  "GLM" = glm(popularity ~ acousticness + danceability +energy + instrumentalness + liveness + loudness + speechiness + tempo + valence + key+mode+(energy * loudness) + (valence * danceability) , data = numeric_spotifydf , family = gaussian)
)
modelsummary(models,
             fmt = 1,
               estimate  = c(
                "{estimate} ({std.error})",
                "{estimate} [{conf.low}, {conf.high}]"),
               statistic = NULL,
              coef_omit = "Intercept",
             output = "table1.png")
## save_kable will have the best result with magick installed.
modelsummary(models,gof_omit = ".*",
               statistic = c("conf.int",
                           "s.e. = {std.error}", 
                           "t = {statistic}",
                           "p = {p.value}"),
             output = "table2.png")
## save_kable will have the best result with magick installed.

Create train and test sets

train = sample(1:nrow(numeric_spotifydf), 300)
rf.spotify = randomForest(popularity~., data = numeric_spotifydf, subset = train)
rf.spotify
## 
## Call:
##  randomForest(formula = popularity ~ ., data = numeric_spotifydf,      subset = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 226.7792
##                     % Var explained: 26.54
oob.err = double(17)
test.err = double(17)
for(mtry in 1:17){
    fit = randomForest(popularity~., data = numeric_spotifydf, subset=train, mtry=mtry, ntree = 350)
      oob.err[mtry] = fit$mse[350]
      pred = predict(fit, numeric_spotifydf[-train,])
      test.err[mtry] = with(numeric_spotifydf[-train,], mean( (popularity-pred)^2 ))
}
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range

## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range

## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range

## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range

## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range

Decision Trees 17*350 trees with MSE for OOB and Test errors

matplot(1:mtry, cbind(test.err, oob.err), pch = 23, col = c("red", "blue"), type = "b", ylab="Mean Squared Error", lwd=6)
legend("topright", legend = c("OOB", "Test"), pch = 23, col = c("red", "blue"))

oob.err 
##  [1] 236.2436 230.5873 230.8398 227.5230 225.2099 229.2312 229.9762 226.8571
##  [9] 229.3779 229.5540 231.0414 228.9525 231.3045 235.7410 235.4476 230.5074
## [17] 228.1559
test.err
##  [1] 253.6560 246.5739 245.8398 246.8088 246.5483 245.8082 247.1158 247.3210
##  [9] 248.3818 248.9581 248.1222 249.0326 248.3558 249.5552 249.6177 248.9057
## [17] 250.2120

Basic Random Forest

rf.spotify1 = randomForest(popularity~., data = numeric_spotifydf, subset = train, importance = TRUE)
rf.spotify1
## 
## Call:
##  randomForest(formula = popularity ~ ., data = numeric_spotifydf,      importance = TRUE, subset = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 230.6748
##                     % Var explained: 25.28

Use Random forest to find Variance based Feature Importances.

rf.spotify1$importance
##                     %IncMSE IncNodePurity
## acousticness     34.1745592     9830.9960
## danceability     12.1398562     8347.2107
## duration_ms      50.4042676    14207.0226
## energy           17.3363787     7676.4940
## instrumentalness 19.1327507     5464.8742
## key              -0.1814309     4036.2543
## liveness          5.0198167     5610.3812
## loudness         51.8448210    12817.5253
## mode             -0.3448481      688.7333
## speechiness      16.3756640     6799.7127
## tempo             4.5693007     5384.1999
## valence           8.8968408     7107.0085
rf.spotify1$importanceSD
##     acousticness     danceability      duration_ms           energy 
##        2.5823897        1.9772615        2.3262753        2.2252300 
## instrumentalness              key         liveness         loudness 
##        1.6873494        1.1869525        1.4640788        3.4224780 
##             mode      speechiness            tempo          valence 
##        0.4675082        1.6338029        1.3380289        1.6947052
create_rfplot <- function(rf, type){
  
  imp <- importance(rf, type = type, scale = F)
  
  featureImportance <- data.frame(Feature = row.names(imp), Importance = imp[,1])
  
  p <- ggplot(featureImportance, aes(x = reorder(Feature, Importance), y = Importance)) +
       geom_bar(stat = "identity", fill = featureImportance$Importance, width = 0.65) +
       coord_flip() + 
       theme_light(base_size = 25) +
       theme(axis.title.x = element_text(size = 20, color = "black"),
             axis.title.y = element_blank(),
             axis.text.x  = element_text(size = 20, color = "black"),
             axis.text.y  = element_text(size = 20, color = "black")) 
  return(p)
}
create_rfplot(rf.spotify1, type = 2)

data.frame(Feature = row.names(rf.spotify1$importance), Importance = rf.spotify1$importance[,1])
##                           Feature Importance
## acousticness         acousticness 34.1745592
## danceability         danceability 12.1398562
## duration_ms           duration_ms 50.4042676
## energy                     energy 17.3363787
## instrumentalness instrumentalness 19.1327507
## key                           key -0.1814309
## liveness                 liveness  5.0198167
## loudness                 loudness 51.8448210
## mode                         mode -0.3448481
## speechiness           speechiness 16.3756640
## tempo                       tempo  4.5693007
## valence                   valence  8.8968408
rf.spotify2 = randomForest(popularity~., data = numeric_spotifydf, subset = train, replace = FALSE, nodesize = 7, keep.forest = TRUE, keep.inbag = TRUE)
rf.spotify2
## 
## Call:
##  randomForest(formula = popularity ~ ., data = numeric_spotifydf,      replace = FALSE, nodesize = 7, keep.forest = TRUE, keep.inbag = TRUE,      subset = train) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##           Mean of squared residuals: 228.1065
##                     % Var explained: 26.11
permimp <- permimp(rf.spotify2, conditional = TRUE, progressBar = FALSE, do_check=FALSE)
permimp$values
##     acousticness     danceability      duration_ms           energy 
##       2.34837180       1.96291065      35.65975844       1.21448208 
## instrumentalness              key         liveness         loudness 
##       1.26052014       0.01857131       0.71891768       4.86700464 
##             mode      speechiness            tempo          valence 
##       0.05128700       6.41444603       3.32140668       3.90311899
ggplot(as.data.frame(permimp$values), aes(x = reorder(names(permimp$values)
, as.numeric(permimp$values)), y = as.numeric(permimp$values))) +
       geom_bar(stat = "identity", fill = as.factor(seq(0,11)), width = 0.65) +
       coord_flip() + 
       theme_light(base_size = 25) +
       theme(axis.title.x = element_text(size = 20, color = "black"),
             axis.title.y = element_blank(),
             axis.text.x  = element_text(size = 20, color = "black"),
             axis.text.y  = element_text(size = 20, color = "black"))+xlab("Features")+ylab("Importance")